# Load the loan sample.
# NOTE: the original started with rm(list = ls()) and setwd(). Both are
# anti-patterns: clearing the environment is not a fresh session (packages,
# options and seeds survive), and hard-coding the working directory breaks the
# script on every other machine. Read the file via its path instead.
set.seed(7)
data_loans <- read_csv("C:/Users/Student/SBD2/SBD2_Group12/loan_sample_12.csv")
## Rows: 40000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): grade, home_ownership, verification_status, purpose, application_type
## dbl (12): loan_amnt, int_rate, annual_inc, dti, open_acc, revol_bal, revol_u...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Work on a plain data.frame copy so the raw import stays untouched.
data <- as.data.frame(data_loans)
For this we use the function str(). We see that there are 40,000 observations of 17 variables.
## 'data.frame': 40000 obs. of 17 variables:
## $ loan_amnt : num 3200 1200 20000 3125 4200 ...
## $ int_rate : num 11 11.4 13.1 15.6 13.7 ...
## $ grade : chr "B" "B" "B" "C" ...
## $ home_ownership : chr "RENT" "MORTGAGE" "MORTGAGE" "RENT" ...
## $ annual_inc : num 35422 100000 74686 18000 70100 ...
## $ verification_status: chr "Not Verified" "Not Verified" "Verified" "Verified" ...
## $ purpose : chr "debt_consolidation" "major_purchase" "credit_card" "debt_consolidation" ...
## $ dti : num 14.03 19.64 17.95 29.13 5.17 ...
## $ open_acc : num 12 6 13 17 10 8 18 10 19 12 ...
## $ revol_bal : num 3906 15257 29312 10689 6292 ...
## $ revol_util : num 34.9 96.6 81.2 54.8 37 47.7 32.4 74.7 65.9 33.4 ...
## $ total_acc : num 29 9 24 29 26 24 38 16 32 25 ...
## $ total_rec_int : num 519.9 75.2 3993.5 814.9 225.1 ...
## $ application_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ tot_cur_bal : num 45459 62751 65669 17382 334523 ...
## $ total_rev_hi_lim : num 11200 15800 36100 19500 17000 39000 34500 18500 15500 33500 ...
## $ Status : num 0 0 0 0 1 0 0 0 0 0 ...
For this we use the function overview(). We see that there are 12 numerical and 5 categorical variables.
overview(data)
## division metrics value
## 1 size observations 40000
## 2 size variables 17
## 3 size values 680000
## 4 size memory size 5444352
## 5 duplicated duplicate observation 0
## 6 missing complete observation 40000
## 7 missing missing observation 0
## 8 missing missing variables 0
## 9 missing missing values 0
## 10 data type numerics 12
## 11 data type integers 0
## 12 data type factors/ordered 0
## 13 data type characters 5
## 14 data type Dates 0
## 15 data type POSIXcts 0
## 16 data type others 0
To summarize the variables we use the function summary(). Looking at the variables, we can see that the mean of most variables is close to their median, indicating a low occurrence of outliers. The only exception here is the variable "tot_cur_bal", which has a big difference between mean and median, indicating a higher occurrence of outliers.
summary(data)
## loan_amnt int_rate grade home_ownership
## Min. : 1000 Min. : 5.31 Length:40000 Length:40000
## 1st Qu.: 7000 1st Qu.: 9.44 Class :character Class :character
## Median :10075 Median :12.29 Mode :character Mode :character
## Mean :11675 Mean :12.65
## 3rd Qu.:15000 3rd Qu.:15.05
## Max. :40000 Max. :27.49
## annual_inc verification_status purpose dti
## Min. : 7000 Length:40000 Length:40000 Min. : 0.00
## 1st Qu.: 42000 Class :character Class :character 1st Qu.:12.13
## Median : 57000 Mode :character Mode :character Median :17.63
## Mean : 63397 Mean :18.24
## 3rd Qu.: 77000 3rd Qu.:23.93
## Max. :400000 Max. :57.90
## open_acc revol_bal revol_util total_acc
## Min. : 1.00 Min. : 0 Min. : 0.00 Min. : 3.00
## 1st Qu.: 8.00 1st Qu.: 5629 1st Qu.: 34.80 1st Qu.:15.00
## Median :10.00 Median : 9821 Median : 52.30 Median :20.00
## Mean :10.31 Mean :11989 Mean : 52.16 Mean :21.28
## 3rd Qu.:13.00 3rd Qu.:15832 3rd Qu.: 69.80 3rd Qu.:27.00
## Max. :23.00 Max. :78762 Max. :123.20 Max. :57.00
## total_rec_int application_type tot_cur_bal total_rev_hi_lim
## Min. : 0.0 Length:40000 Min. : 0 Min. : 300
## 1st Qu.: 678.6 Class :character 1st Qu.: 25217 1st Qu.: 13000
## Median :1346.1 Mode :character Median : 53723 Median : 20900
## Mean :1819.8 Mean : 99244 Mean : 24208
## 3rd Qu.:2428.2 3rd Qu.:158466 3rd Qu.: 32200
## Max. :8834.9 Max. :472573 Max. :100000
## Status
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1297
## 3rd Qu.:0.0000
## Max. :1.0000
In the next step, we investigate our target variable. We notice that in our sample, we have 34,813 borrowers who did not default on their loan and 5,187 who did default.
PercTable(data$Status)
##
## freq perc
##
## 0 34'813 87.0%
## 1 5'187 13.0%
# Bar chart of the target variable. Status is coded 0/1 as a numeric, so map
# it as a factor to get a discrete axis with the labels "0" and "1". The
# original also mapped fill = Status inside aes(), which was dead code because
# geom_bar(fill = "steelblue") overrode it.
ggplot(data, aes(x = factor(Status))) +
  geom_bar(fill = "steelblue",
           color = "grey") +
  ylab("Count") +
  xlab("Status")
The target variable is not balanced, therefore we are going to undersample it.
# Balance the target by undersampling the majority class (ROSE). The seed
# makes the random row selection reproducible.
set.seed(7)
data_balanced <- ovun.sample(Status ~ ., data = data, method = "under")
data_under <- as.data.frame(data_balanced$data)
##
## freq perc
##
## 0 5'188 50.0%
## 1 5'187 50.0%
We investigate the occurrence of outliers by creating a boxplot for every feature.
# Dimensions of the balanced set (roughly 2 x the minority class size)
dim(data_under)
## [1] 10375 17
# Keep the numeric columns in their own data.frame for the outlier analysis.
data_under_num <- data_under %>%
  select_if(is.numeric) %>%
  as.data.frame()
dim(data_under_num)
## [1] 10375 12
# Keep the character (categorical) columns separately; they are re-attached
# after the numeric columns have been capped.
data_under_cat <- data_under %>%
  select_if(is.character) %>%
  as.data.frame()
dim(data_under_cat)
## [1] 10375 5
# Boxplots of the scaled numeric features; x-axis labels are suppressed and
# drawn manually so they can be rotated.
boxplot(scale(data_under_num), main = "Boxplot with outliers", xaxt = "n", col = "steelblue")
text(x = seq_along(data_under_num),  # seq_along instead of 1:length (safe for empty input)
     y = par("usr")[3] - 0.8,        # just below the plot region
     labels = names(data_under_num),
     xpd = NA,                       # allow drawing outside the plot region
     srt = 35,                       # rotate the labels by 35 degrees
     cex = 0.8,
     adj = 1)
# Diagnose outliers per numeric column: count, ratio, and means with/without them (dlookr)
diagnose_outlier(data_under_num)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 loan_amnt 226 2.1783133 32598.11947 1.186074e+04
## 2 int_rate 187 1.8024096 25.85684 1.356186e+01
## 3 annual_inc 325 3.1325301 163412.27692 6.100994e+04
## 4 dti 24 0.2313253 47.52042 1.896574e+01
## 5 open_acc 77 0.7421687 21.45455 1.031422e+01
## 6 revol_bal 461 4.4433735 38873.62473 1.182393e+04
## 7 revol_util 0 0.0000000 NaN 5.330123e+01
## 8 total_acc 49 0.4722892 49.20408 2.090737e+01
## 9 total_rec_int 599 5.7734940 6561.78888 1.843341e+03
## 10 tot_cur_bal 352 3.3927711 367913.71023 9.163402e+04
## 11 total_rev_hi_lim 274 2.6409639 67766.16058 2.321985e+04
## 12 Status 0 0.0000000 NaN 4.999518e-01
## without_mean
## 1 1.139896e+04
## 2 1.333619e+01
## 3 5.769842e+04
## 4 1.889953e+01
## 5 1.023092e+01
## 6 1.056612e+04
## 7 5.330123e+01
## 8 2.077310e+01
## 9 1.554230e+03
## 10 8.193129e+04
## 11 2.201148e+04
## 12 4.999518e-01
diagnose_numeric(data_under_num)
## # A tibble: 12 × 10
## variables min Q1 mean median Q3 max zero minus outlier
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 loan_amnt 1 e3 6.6 e3 1.19e+4 1.01e4 1.57e4 4 e4 0 0 226
## 2 int_rate 5.31e0 1.05e1 1.36e+1 1.31e1 1.62e1 2.75e1 0 0 187
## 3 annual_inc 7.68e3 4 e4 6.10e+4 5.5 e4 7.5 e4 3.85e5 0 0 325
## 4 dti 0 1.28e1 1.90e+1 1.84e1 2.48e1 5.77e1 4 0 24
## 5 open_acc 1 e0 8 e0 1.03e+1 1 e1 1.3 e1 2.2 e1 0 0 77
## 6 revol_bal 0 5.53e3 1.18e+4 9.70e3 1.56e4 7.88e4 29 0 461
## 7 revol_util 0 3.62e1 5.33e+1 5.36e1 7.13e1 1.21e2 33 0 0
## 8 total_acc 3 e0 1.4 e1 2.09e+1 2 e1 2.7 e1 5.7 e1 0 0 49
## 9 total_rec_int 0 6.77e2 1.84e+3 1.33e3 2.48e3 8.83e3 42 0 599
## 10 tot_cur_bal 0 2.36e4 9.16e+4 4.77e4 1.42e5 4.64e5 3 0 352
## 11 total_rev_hi_… 5 e2 1.24e4 2.32e+4 2 e4 3.07e4 1 e5 0 0 274
## 12 Status 0 0 5.00e-1 0 1 e0 1 e0 5188 0 0
We can now visualize all columns with and without outliers
# Visualise the distribution with vs without outliers, restricted to the
# features whose outlier ratio is at least 0.5%.
high_outlier_vars <- diagnose_outlier(data_under_num) %>%
  filter(outliers_ratio >= 0.5) %>%  # dplyr
  select(variables) %>%
  unlist()
data_under_num %>%
  plot_outlier(col = "steelblue", high_outlier_vars)
We see that without the outliers, the distributions look considerably better almost everywhere. So we now create a standard function that replaces the outliers with the 5th and 95th percentile values of the respective feature (winsorizing).
# Define outlier functions
# Cap (winsorize): values below the `lower` quantile are raised to it and
# values above the `upper` quantile are lowered to it. The defaults keep the
# original hard-coded 5th/95th-percentile behaviour; the probabilities are now
# parameters so other cut-offs can be tried without editing the function.
#
# x     - numeric vector to cap
# lower - lower quantile probability (default 0.05)
# upper - upper quantile probability (default 0.95)
# na.rm - passed to quantile(); TRUE lets vectors containing NA be capped
#         instead of erroring (NA entries themselves are left as NA)
outlier_cap <- function(x, lower = 0.05, upper = 0.95, na.rm = TRUE) {
  quantiles <- quantile(x, c(lower, upper), na.rm = na.rm)
  x[x < quantiles[1]] <- quantiles[1]
  x[x > quantiles[2]] <- quantiles[2]
  x
}
data_under_num_capped <- map_df(data_under_num, outlier_cap)
# Same boxplot on the capped data; whiskers should now be much tighter.
# Adds a title for symmetry with the "with outliers" plot above and uses
# seq_along instead of 1:length.
boxplot(scale(data_under_num_capped), main = "Boxplot without outliers", xaxt = "n", col = "steelblue")
text(x = seq_along(data_under_num_capped),
     y = par("usr")[3] - 0.8,
     labels = names(data_under_num_capped),
     xpd = NA,
     srt = 35,  # rotate the labels by 35 degrees
     cex = 0.8,
     adj = 1)
# New data set with capped outliers: capped numerics + untouched categoricals.
# Row order is preserved by cbind, so the columns still align with data_under.
data_under_capped <- cbind(data_under_num_capped, data_under_cat)
# NOTE(review): the call below re-diagnoses the UNCAPPED numeric data, so it
# prints the same table as before. To verify that capping worked it should
# probably be diagnose_numeric(data_under_num_capped) - confirm intent.
diagnose_numeric(data_under_num)
## # A tibble: 12 × 10
## variables min Q1 mean median Q3 max zero minus outlier
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 loan_amnt 1 e3 6.6 e3 1.19e+4 1.01e4 1.57e4 4 e4 0 0 226
## 2 int_rate 5.31e0 1.05e1 1.36e+1 1.31e1 1.62e1 2.75e1 0 0 187
## 3 annual_inc 7.68e3 4 e4 6.10e+4 5.5 e4 7.5 e4 3.85e5 0 0 325
## 4 dti 0 1.28e1 1.90e+1 1.84e1 2.48e1 5.77e1 4 0 24
## 5 open_acc 1 e0 8 e0 1.03e+1 1 e1 1.3 e1 2.2 e1 0 0 77
## 6 revol_bal 0 5.53e3 1.18e+4 9.70e3 1.56e4 7.88e4 29 0 461
## 7 revol_util 0 3.62e1 5.33e+1 5.36e1 7.13e1 1.21e2 33 0 0
## 8 total_acc 3 e0 1.4 e1 2.09e+1 2 e1 2.7 e1 5.7 e1 0 0 49
## 9 total_rec_int 0 6.77e2 1.84e+3 1.33e3 2.48e3 8.83e3 42 0 599
## 10 tot_cur_bal 0 2.36e4 9.16e+4 4.77e4 1.42e5 4.64e5 3 0 352
## 11 total_rev_hi_… 5 e2 1.24e4 2.32e+4 2 e4 3.07e4 1 e5 0 0 274
## 12 Status 0 0 5.00e-1 0 1 e0 1 e0 5188 0 0
# Demonstrate dlookr's median imputation of outliers on a single column;
# summary() compares the distribution before vs after. Note: imp_income is
# not used further below - the capped data set is what the analysis keeps.
imp_income <- imputate_outlier(data_under, annual_inc, method = "median")
summary(imp_income)
## Impute outliers with median
##
## * Information of Imputation (before vs after)
## Original Imputation
## described_variables "value" "value"
## n "10375" "10375"
## na "0" "0"
## mean "61009.94" "57613.89"
## sd "30223.72" "22941.54"
## se_mean "296.7248" "225.2312"
## IQR "35000" "30000"
## skewness "2.0331164" "0.7244068"
## kurtosis "8.6939744" "0.1248179"
## p00 "7680" "7680"
## p01 "19472.44" "19472.44"
## p05 "26000" "26000"
## p10 "30551.2" "30551.2"
## p20 "38000" "38000"
## p25 "40000" "40000"
## p30 "44000" "44000"
## p40 "50000" "50000"
## p50 "55000" "55000"
## p60 "60000" "60000"
## p70 "70000" "65000"
## p75 "75000" "70000"
## p80 "80000" "75000"
## p90 "99270" "90000"
## p95 "115060.0" "102550.9"
## p99 "168000" "120000"
## p100 "385000" "127500"
# Boxplot of every capped numeric feature (columns 1-11; column 12 is Status
# and 13-17 are the categorical columns, all excluded as in the original
# -c(12:17) indexing). Fixes in this version:
#  * the original passed data_under to ggplot() but took y from
#    data_under_capped[, i], which only worked because both frames happen to
#    share row order - plot the capped frame directly instead;
#  * aes(color = Status) was dead code, immediately overridden by
#    geom_boxplot(color = "black");
#  * 1:length(...) replaced by an explicit name-based loop.
for (col_name in names(data_under_capped)[1:11]) {
  print(
    ggplot(data_under_capped, aes(y = .data[[col_name]])) +
      geom_boxplot(fill = "steelblue",
                   color = "black") +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank()) +
      labs(title = "Distribution",
           y = col_name)
  )
}
# Stacked bar charts of each categorical feature against the target.
# Status is numeric 0/1, so it is wrapped in factor() to get a discrete
# two-bar x-axis instead of a continuous one. The purpose plot keeps the
# default palette because scale_fill_brewer() does not have enough colours
# for its 14 levels.
ggplot(data_under_capped,
       aes(x = factor(Status),
           fill = grade)) +
  geom_bar(position = "stack") +
  scale_fill_brewer()
ggplot(data_under_capped,
       aes(x = factor(Status),
           fill = home_ownership)) +
  geom_bar(position = "stack") +
  scale_fill_brewer()
ggplot(data_under_capped,
       aes(x = factor(Status),
           fill = verification_status)) +
  geom_bar(position = "stack") +
  scale_fill_brewer()
ggplot(data_under_capped,
       aes(x = factor(Status),
           fill = purpose)) +
  geom_bar(position = "stack")
ggplot(data_under_capped,
       aes(x = factor(Status),
           fill = application_type)) +
  geom_bar(position = "stack") +
  scale_fill_brewer()
In the next step, we look at the correlations that emerge between the variables
# Correlation matrix of the capped numeric features with significance
# p-values overlaid; insignificant cells are crossed out by ggcorrplot.
clean_num <- data.frame(data_under_num_capped)
correlations <- cor(clean_num)
p_values <- cor_pmat(clean_num)
ggcorrplot(correlations,
           type = "lower",
           p.mat = p_values,
           ggtheme = ggplot2::theme_gray,
           colors = c("red", "white", "steelblue"))
# Scatter plot of loan amount vs annual income with a linear trend line,
# rendered as an interactive plotly widget.
scatter_plot <- ggplot(data, aes(x = loan_amnt, y = annual_inc)) +
  geom_point(color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE)
# interactive version
ggplotly(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Re-create the undersampled set. Because the seed is identical, this yields
# exactly the same rows as the data_under built earlier - the re-run is
# redundant but harmless. The barplot confirms the 50/50 class balance.
set.seed(7)
data_balanced <- ovun.sample(Status ~ ., data=data, method = "under")
data_under <- data.frame(data_balanced[["data"]])
barplot(table(data_under$Status), col="steelblue")
To identify the relevant variables we use the Boruta algorithm.
# Boruta feature selection on the capped, balanced data. The target must be a
# factor so Boruta's random forest treats this as classification.
# NOTE(review): clean_num is created here but never used below.
clean <- data.frame(data_under_capped)
clean_num <- data.frame(data_under_num_capped)
clean$Status <- as.factor(clean$Status)
# ---- long-running step: takes 10-15 minutes ----
boruta_output <- Boruta(Status~., data = clean, doTrace=2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +2.3 mins:
## confirmed 12 attributes: annual_inc, dti, grade, home_ownership, int_rate and 7 more;
## rejected 1 attribute: application_type;
## still have 3 attributes left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## After 15 iterations, +3 mins:
## confirmed 1 attribute: total_acc;
## still have 2 attributes left.
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## After 58 iterations, +11 mins:
## confirmed 1 attribute: verification_status;
## still have 1 attribute left.
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## 82. run of importance source...
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
# Names of the attributes Boruta confirmed (plus any still tentative).
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(boruta_signif)
## [1] "loan_amnt" "int_rate" "annual_inc"
## [4] "dti" "open_acc" "revol_bal"
## [7] "revol_util" "total_acc" "total_rec_int"
## [10] "tot_cur_bal" "total_rev_hi_lim" "grade"
## [13] "home_ownership" "verification_status" "purpose"
# Variable-importance plot from the Boruta run.
plot(boruta_output, cex.axis = 0.7, las = 2, xlab = "", main = "Variable Importance", col = "steelblue")
# Keep only the selected predictors plus the target. Subsetting by name
# replaces the original's temporary global `Status` vector + cbind, producing
# the same columns in the same order without polluting the environment.
features <- clean[, c(boruta_signif, "Status")]
# Stratified 80/20 train split; list = FALSE (not the reassignable alias F)
# returns a matrix of row indices.
set.seed(7)
div <- createDataPartition(y = features$Status, p = 0.8, list = FALSE)
data.train <- features[div, ]
PercTable(data.train$Status)
##
## freq perc
##
## 0 4'151 50.0%
## 1 4'150 50.0%
# Remaining 20% is the held-out test set; PercTable confirms it stays balanced.
data.test <- features[-div,]
PercTable(data.test$Status)
##
## freq perc
##
## 0 1'037 50.0%
## 1 1'037 50.0%
# Baseline logistic regression on all Boruta-selected features.
fit1 <- glm(Status ~ ., family = binomial(), data = data.train)
summary(fit1)
##
## Call:
## glm(formula = Status ~ ., family = binomial(), data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2286 -1.0791 -0.4451 1.0602 2.0913
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.970e+00 3.079e-01 -9.645 < 2e-16 ***
## loan_amnt 4.652e-05 6.139e-06 7.578 3.50e-14 ***
## int_rate 1.414e-01 1.647e-02 8.584 < 2e-16 ***
## annual_inc -4.292e-06 1.278e-06 -3.360 0.000781 ***
## dti 2.232e-02 3.604e-03 6.192 5.93e-10 ***
## open_acc 1.828e-02 9.626e-03 1.900 0.057497 .
## revol_bal -3.820e-06 8.840e-06 -0.432 0.665692
## revol_util 1.613e-03 2.079e-03 0.776 0.437746
## total_acc -3.872e-03 3.963e-03 -0.977 0.328539
## total_rec_int -2.198e-04 2.475e-05 -8.881 < 2e-16 ***
## tot_cur_bal -8.647e-07 3.942e-07 -2.193 0.028280 *
## total_rev_hi_lim -2.633e-06 4.756e-06 -0.554 0.579823
## gradeB 3.665e-01 9.967e-02 3.677 0.000236 ***
## gradeC 4.768e-01 1.348e-01 3.537 0.000405 ***
## gradeD 3.978e-01 2.018e-01 1.971 0.048756 *
## home_ownershipOWN -2.716e-02 8.716e-02 -0.312 0.755342
## home_ownershipRENT 1.934e-01 6.551e-02 2.952 0.003158 **
## verification_statusSource Verified 1.075e-01 5.576e-02 1.928 0.053845 .
## verification_statusVerified 8.865e-02 6.459e-02 1.372 0.169923
## purposecredit_card 3.394e-01 2.242e-01 1.514 0.130015
## purposedebt_consolidation 1.546e-01 2.202e-01 0.702 0.482708
## purposehome_improvement 2.963e-01 2.415e-01 1.227 0.219925
## purposehouse -1.772e-01 4.077e-01 -0.435 0.663782
## purposemajor_purchase 2.382e-01 2.740e-01 0.869 0.384829
## purposemedical 3.720e-01 2.929e-01 1.270 0.204109
## purposemoving 9.682e-02 3.792e-01 0.255 0.798476
## purposeother 7.745e-02 2.374e-01 0.326 0.744216
## purposerenewable_energy 1.233e+00 9.138e-01 1.349 0.177349
## purposesmall_business 8.719e-01 3.430e-01 2.542 0.011012 *
## purposevacation 5.147e-01 3.537e-01 1.455 0.145671
## purposewedding -3.004e-01 5.864e-01 -0.512 0.608455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11508 on 8300 degrees of freedom
## Residual deviance: 10456 on 8270 degrees of freedom
## AIC: 10518
##
## Number of Fisher Scoring iterations: 4
# Only show significant variables (p < 0.05). Use the full element name
# `coefficients` - `$coeff` only worked via partial matching, a known
# footgun. The intercept row is dropped with [-1, ].
significant.variables <- summary(fit1)$coefficients[-1, 4] < 0.05
names(significant.variables)[significant.variables]
## [1] "loan_amnt" "int_rate" "annual_inc"
## [4] "dti" "total_rec_int" "tot_cur_bal"
## [7] "gradeB" "gradeC" "gradeD"
## [10] "home_ownershipRENT" "purposesmall_business"
# ROC curve for fit1 on the held-out test set. predict() arguments are now
# fully named - the original passed data.test positionally after a named
# argument, which worked only by accidental position matching.
data.test$fit1_score <- predict(fit1, newdata = data.test, type = "response")
fit1_pred <- prediction(data.test$fit1_score, data.test$Status)
fit1_roc <- performance(fit1_pred, "tpr", "fpr")
plot(fit1_roc, lwd = 1, colorize = TRUE, main = "Fit1: Logit - ROC Curve")
lines(x = c(0, 1), y = c(0, 1), col = "black", lwd = 1, lty = 3)  # chance diagonal
# Precision/recall curve
fit1_precision <- performance(fit1_pred, measure = "prec", x.measure = "rec")
plot(fit1_precision, main = "Fit1: Logit - Precision vs Recall")
# Confusion matrix at the implicit 0.5 threshold (round() on probabilities).
confusionMatrix(as.factor(round(data.test$fit1_score)), data.test$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 662 356
## 1 375 681
##
## Accuracy : 0.6475
## 95% CI : (0.6265, 0.6681)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2951
##
## Mcnemar's Test P-Value : 0.5056
##
## Sensitivity : 0.6384
## Specificity : 0.6567
## Pos Pred Value : 0.6503
## Neg Pred Value : 0.6449
## Prevalence : 0.5000
## Detection Rate : 0.3192
## Detection Prevalence : 0.4908
## Balanced Accuracy : 0.6475
##
## 'Positive' Class : 0
##
# Area under the ROC curve, reported as a percentage.
fit1_auc <- performance(fit1_pred, measure = "auc")
cat("AUC: ",fit1_auc@y.values[[1]]*100)
## AUC: 69.74769
In this case, the AUC value is about 0.70, which indicates that the model discriminates between the positive and negative classes only moderately well.
## Training and testing without any data quality checks and feature selection
# Split the raw (imbalanced) data 70/30, stratified on the target.
set.seed(7)
data$Status <- as.factor(data$Status)
div_2 <- createDataPartition(y = data$Status, p = 0.7, list = FALSE)
# Training sample (70%).
# BUG FIX: the original subset with `div` - the 80% partition of the BALANCED
# feature set - instead of the freshly created `div_2`, so the intended 70/30
# split of the full data was never applied (visible in the printed output:
# 8300 training rows instead of ~28000).
data.train_2 <- data[div_2, ]
# Test sample: the remaining 30%.
data.test_2 <- data[-div_2, ]
# Fit the model on all raw features.
fit2 <- glm(Status ~ ., data = data.train_2, family = binomial())
summary(fit2)
##
## Call:
## glm(formula = Status ~ ., family = binomial(), data = data.train_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4738 -0.5732 -0.4379 -0.2898 2.8907
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.159e+00 4.116e-01 -10.105 < 2e-16 ***
## loan_amnt 5.793e-05 8.085e-06 7.166 7.75e-13 ***
## int_rate 9.738e-02 1.765e-02 5.518 3.42e-08 ***
## gradeB 5.836e-01 1.631e-01 3.578 0.000346 ***
## gradeC 8.721e-01 1.906e-01 4.575 4.76e-06 ***
## gradeD 9.602e-01 2.614e-01 3.673 0.000240 ***
## home_ownershipOWN -1.416e-03 1.262e-01 -0.011 0.991045
## home_ownershipRENT 1.983e-01 9.369e-02 2.116 0.034318 *
## annual_inc -3.521e-06 1.573e-06 -2.239 0.025171 *
## verification_statusSource Verified 1.333e-01 8.172e-02 1.631 0.102919
## verification_statusVerified -1.233e-02 9.338e-02 -0.132 0.894964
## purposecredit_card -3.028e-01 3.100e-01 -0.977 0.328742
## purposedebt_consolidation -3.386e-01 3.036e-01 -1.115 0.264744
## purposehome_improvement -4.290e-01 3.418e-01 -1.255 0.209459
## purposehouse -1.639e+00 8.027e-01 -2.042 0.041159 *
## purposemajor_purchase -9.134e-01 4.389e-01 -2.081 0.037423 *
## purposemedical -5.670e-02 4.123e-01 -0.138 0.890617
## purposemoving -7.267e-01 5.382e-01 -1.350 0.176924
## purposeother -5.061e-01 3.283e-01 -1.542 0.123140
## purposerenewable_energy -3.664e-01 1.134e+00 -0.323 0.746640
## purposesmall_business 1.667e-01 4.151e-01 0.402 0.687922
## purposevacation 2.898e-01 4.588e-01 0.632 0.527602
## purposewedding -1.211e+01 2.367e+02 -0.051 0.959187
## dti 1.591e-02 4.791e-03 3.319 0.000902 ***
## open_acc 2.270e-02 1.211e-02 1.874 0.060970 .
## revol_bal -2.024e-06 1.085e-05 -0.187 0.852032
## revol_util 1.076e-03 2.697e-03 0.399 0.689988
## total_acc -2.801e-03 5.195e-03 -0.539 0.589770
## total_rec_int -1.890e-04 2.852e-05 -6.626 3.44e-11 ***
## application_typeJoint App 3.767e-01 2.421e-01 1.556 0.119777
## tot_cur_bal -8.561e-07 5.392e-07 -1.588 0.112310
## total_rev_hi_lim -5.870e-06 6.402e-06 -0.917 0.359160
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6433.3 on 8300 degrees of freedom
## Residual deviance: 5906.7 on 8269 degrees of freedom
## AIC: 5970.7
##
## Number of Fisher Scoring iterations: 12
# Extract and compare the significant variables (p < 0.05); intercept dropped.
# Uses the full element name `coefficients` instead of partial-matched $coeff.
significant.variables <- summary(fit2)$coefficients[-1, 4] < 0.05
names(significant.variables)[significant.variables]
## [1] "loan_amnt" "int_rate" "gradeB"
## [4] "gradeC" "gradeD" "home_ownershipRENT"
## [7] "annual_inc" "purposehouse" "purposemajor_purchase"
## [10] "dti" "total_rec_int"
# Test the performance on the held-out sample; predict() arguments named.
data.test_2$fit2_score <- predict(fit2, newdata = data.test_2, type = "response")
fit2_pred <- prediction(data.test_2$fit2_score, data.test_2$Status)
fit2_roc <- performance(fit2_pred, "tpr", "fpr")
plot(fit2_roc, lwd = 1, colorize = TRUE, main = "Fit2: Logit - ROC Curve")
lines(x = c(0, 1), y = c(0, 1), col = "black", lwd = 1, lty = 3)  # chance diagonal
# Confusion matrix at the implicit 0.5 threshold.
confusionMatrix(as.factor(round(data.test_2$fit2_score)), data.test_2$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27542 4050
## 1 54 53
##
## Accuracy : 0.8705
## 95% CI : (0.8668, 0.8742)
## No Information Rate : 0.8706
## P-Value [Acc > NIR] : 0.5108
##
## Kappa : 0.0187
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99804
## Specificity : 0.01292
## Pos Pred Value : 0.87180
## Neg Pred Value : 0.49533
## Prevalence : 0.87056
## Detection Rate : 0.86886
## Detection Prevalence : 0.99662
## Balanced Accuracy : 0.50548
##
## 'Positive' Class : 0
##
# Extract the AUC value for fit2, reported as a percentage.
fit2_auc <- performance(fit2_pred, measure = "auc")
cat("AUC: ",fit2_auc@y.values[[1]]*100)
## AUC: 68.97785
There are several possible ways to improve the predictive performance of our model. For example, the handling of outliers: the way we deal with outliers can have a huge effect on predictive performance. Furthermore, the use of more training data will most certainly improve the performance of our model. We could also use more advanced algorithms such as Random Forests, Boosted Trees, and Support Vector Machines.
In addition to that, the most powerful way to improve the performance would be to use additional data.
The collection of additional data to provide more training examples could be a way to improve our results. In addition, we could change the way we deal with outliers in our process to test whether or not the results would improve.
Some examples of additional data could be:
Geographic location of the borrower
Payment history (for existing loans)
Education
The model makes its decisions based on the data provided. As the model decides whether or not a loan contract will default based on the data provided, this can become a challenge: the data used may not be accurate or large enough to train the model well enough to make a fair decision. In addition, it is questionable whether such a decision should be made by a model on its own, as there may be relevant information the model cannot consider. Probably the biggest challenge for a company is that, at the moment, our model is only around 70% accurate. Therefore the model could make wrong decisions, leading to a possible loss of money.
A possible way to mitigate such challenges could, for example, be to check whether all information needed to make such a decision is available in the data used by the model. Additionally, it should be assessed whether the model's decision should be absolute or act only as a supportive indicator for the company.
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=German_Switzerland.utf8 LC_CTYPE=German_Switzerland.utf8
## [3] LC_MONETARY=German_Switzerland.utf8 LC_NUMERIC=C
## [5] LC_TIME=German_Switzerland.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.10.0 knitr_1.40
## [3] reshape_0.8.9 networkD3_0.4
## [5] corrr_0.4.4 PerformanceAnalytics_2.0.4
## [7] xts_0.12.2 zoo_1.8-11
## [9] ggcorrplot_0.1.4 RColorBrewer_1.1-3
## [11] forcats_0.5.2 stringr_1.4.1
## [13] purrr_0.3.5 tidyr_1.2.1
## [15] tibble_3.1.8 tidyverse_1.3.2
## [17] ggpubr_0.4.0 DescTools_0.99.46
## [19] corrplot_0.92 ROSE_0.0-4
## [21] dplyr_1.0.10 pROC_1.18.0
## [23] caret_6.0-93 lattice_0.20-45
## [25] ROCR_1.0-11 dlookr_0.6.1
## [27] Boruta_7.0.0 ggplot2_3.4.0
## [29] readr_2.1.3
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.2.0 htmlwidgets_1.5.4
## [4] ranger_0.14.1 grid_4.2.1 munsell_0.5.0
## [7] codetools_0.2-18 future_1.28.0 withr_2.5.0
## [10] colorspace_2.0-3 highr_0.9 rstudioapi_0.14
## [13] stats4_4.2.1 ggsignif_0.6.4 Rttf2pt1_1.3.11
## [16] listenv_0.8.0 labeling_0.4.2 bit64_4.0.5
## [19] farver_2.1.1 parallelly_1.32.1 vctrs_0.5.0
## [22] generics_0.1.3 ipred_0.9-13 xfun_0.33
## [25] pagedown_0.19 R6_2.5.1 cachem_1.0.6
## [28] assertthat_0.2.1 showtext_0.9-5 promises_1.2.0.1
## [31] scales_1.2.1 vroom_1.6.0 nnet_7.3-17
## [34] googlesheets4_1.0.1 rootSolve_1.8.2.3 gtable_0.3.1
## [37] globals_0.16.1 lmom_2.9 timeDate_4021.106
## [40] rlang_1.0.6 systemfonts_1.0.4 splines_4.2.1
## [43] rstatix_0.7.0 extrafontdb_1.0 lazyeval_0.2.2
## [46] ModelMetrics_1.2.2.2 gargle_1.2.1 broom_1.0.1
## [49] yaml_2.3.5 reshape2_1.4.4 abind_1.4-5
## [52] modelr_0.1.9 crosstalk_1.2.0 backports_1.4.1
## [55] httpuv_1.6.6 extrafont_0.18 inum_1.0-4
## [58] tools_4.2.1 lava_1.6.10 ellipsis_0.3.2
## [61] kableExtra_1.3.4 jquerylib_0.1.4 proxy_0.4-27
## [64] Rcpp_1.0.9 plyr_1.8.7 rpart_4.1.16
## [67] haven_2.5.1 hrbrthemes_0.8.0 fs_1.5.2
## [70] magrittr_2.0.3 data.table_1.14.2 reprex_2.0.2
## [73] googledrive_2.0.0 mvtnorm_1.1-3 hms_1.1.2
## [76] mime_0.12 reactable_0.3.0 evaluate_0.17
## [79] xtable_1.8-4 readxl_1.4.1 gridExtra_2.3
## [82] compiler_4.2.1 crayon_1.5.2 htmltools_0.5.3
## [85] mgcv_1.8-40 later_1.3.0 tzdb_0.3.0
## [88] Formula_1.2-4 libcoin_1.0-9 expm_0.999-6
## [91] Exact_3.2 lubridate_1.8.0 DBI_1.1.3
## [94] dbplyr_2.2.1 MASS_7.3-57 boot_1.3-28
## [97] Matrix_1.5-1 car_3.1-1 cli_3.4.1
## [100] quadprog_1.5-8 parallel_4.2.1 gower_1.0.0
## [103] igraph_1.3.5 pkgconfig_2.0.3 recipes_1.0.2
## [106] xml2_1.3.3 foreach_1.5.2 svglite_2.1.0
## [109] bslib_0.4.0 hardhat_1.2.0 webshot_0.5.4
## [112] prodlim_2019.11.13 rvest_1.0.3 digest_0.6.29
## [115] showtextdb_3.0 rmarkdown_2.17 cellranger_1.1.0
## [118] gld_2.6.5 gdtools_0.2.4 curl_4.3.3
## [121] shiny_1.7.2 lifecycle_1.0.3 nlme_3.1-157
## [124] jsonlite_1.8.2 carData_3.0-5 viridisLite_0.4.1
## [127] fansi_1.0.3 pillar_1.8.1 fastmap_1.1.0
## [130] httr_1.4.4 survival_3.3-1 glue_1.6.2
## [133] iterators_1.0.14 bit_4.0.4 class_7.3-20
## [136] stringi_1.7.8 sass_0.4.2 partykit_1.2-16
## [139] e1071_1.7-11 future.apply_1.9.1 sysfonts_0.8.8